UCI hosts a number of datasets for machine learning. We will use the Bank Marketing dataset. For more information, see https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
Load the train and test datasets
In [1]:
# NOTE(review): both right-hand sides are blank, so these two statements are
# incomplete as written. Assign the train and test datasets here before
# running any later cell; the data source is not shown in this file.
train <-
test <-
deposit (named y in the UCI documentation) - has the client subscribed to a term deposit? (binary: 'yes','no')
In [2]:
#Find frequency of deposit
In [6]:
# Exercise scaffold: prints a label for each dataset; the computation itself is
# left to be filled in (presumably the frequency of deposit - confirm).
# Train:
print("Train")
# Code here: add the computation for train
print("Test")
# Code here: add the computation for test
In [7]:
# Find number of rows and columns in train and test
# Exercise scaffold: only the labels are printed; the computation is left blank.
print("Train")
# Code here: add the row/column count for train
print("Test")
# Code here: add the row/column count for test
Out[7]:
Out[7]:
In [9]:
# Find names of the features in train and test
# Exercise scaffold: only the labels are printed; the computation is left blank.
print("Train")
# Code here: add the feature-name listing for train
print("Test")
# Code here: add the feature-name listing for test
Out[9]:
Out[9]:
Exercise
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [13]:
# Show the first six rows of train (six is head()'s default, spelled out here).
head(train, n = 6L)
Out[13]:
Exercise
In [7]:
#Print the head of test dataset
Out[7]:
In [3]:
# Report the class of every element of train as a named list.
lapply(X = train, FUN = class)
Out[3]:
Exercise
In [11]:
#Find the classes for the features in the test dataset. Do they match the train dataset?
In [ ]:
Exercise
In [14]:
#Find if train and test have missing values
In [15]:
#Does train have missing values?
In [ ]:
In [16]:
#Does test have missing values?
In [ ]:
In [19]:
#Find column names of train and test
Out[19]:
In [14]:
# Find summary statistics of train
Out[14]:
In [15]:
#Find summary statistics of test
Out[15]:
In [17]:
#Find correlation between age and deposit in train dataset
Out[17]:
In [20]:
#Find correlation between campaign and deposit in train dataset
Out[20]:
In [21]:
#What is the standard deviation of age in train?
Out[21]:
In [22]:
#What is the mean of age in train?
Out[22]:
In [ ]:
In [30]:
# Logit Function
In [44]:
# Grid of 100 evenly spaced values on [0, 1]. The argument is spelled
# `length.out` in full instead of relying on partial matching of `length`.
x <- seq(0, 1, length.out = 100)
# Drop both endpoints: log(t / (1 - t)) is -Inf at t = 0 and Inf at t = 1.
x <- x[-c(1, length(x))]
In [45]:
#' Logit (log-odds) transform
#'
#' @param t Numeric vector; the odds are computed as `t / (1 - t)`.
#' @return The natural log of the odds, element-wise.
logit <- function(t) {
  odds <- t / (1 - t)
  log(odds)
}
In [46]:
# Plot the logit curve. In a plot formula the left-hand side goes on the
# vertical axis, so the function value logit(x) belongs on the left and its
# argument x on the right; the reverse order draws the inverse function.
plot(logit(x) ~ x, type = "l")
In [47]:
#' Inverse logit (logistic) transform
#'
#' Written as `1 / (1 + exp(-x))` rather than `exp(x) / (1 + exp(x))`: the
#' latter evaluates `Inf / Inf = NaN` once `exp(x)` overflows (x above ~709),
#' whereas this form saturates cleanly at 1 for large x and at 0 for very
#' negative x.
#'
#' @param x Numeric vector.
#' @return Values in [0, 1], element-wise.
inv_logit <- function(x) {
  1 / (1 + exp(-x))
}
In [51]:
# Grid of 200 evenly spaced values on [-100, 100]. The argument is spelled
# `length.out` in full instead of relying on partial matching of `length`.
y <- seq(-100, 100, length.out = 200)
In [50]:
# Plot the inverse-logit curve. In a plot formula the left-hand side goes on
# the vertical axis, so the function value inv_logit(y) belongs on the left
# and its argument y on the right; the reverse order draws the inverse function.
plot(inv_logit(y) ~ y, type = "l")
Exercise
Plot the inverse logit for values between -3 and +3
In [ ]:
In [52]:
#Running the model on train
In [54]:
# Fit a binomial GLM with logit link: deposit is the response and every other
# column of train is a predictor (the `.` on the right-hand side).
model <- glm(
  formula = deposit ~ .,
  family = binomial(link = "logit"),
  data = train
)
In [55]:
# Display the summary of the fitted model.
summary(object = model)
Out[55]:
In [56]:
# Score the test set with the fitted model; type = "response" requests
# predictions on the response scale rather than the link scale.
test_prediction <- predict(
  model,
  newdata = test,
  type = "response"
)
In [57]:
# Show the first six predictions (six is head()'s default, spelled out here).
head(test_prediction, n = 6L)
Out[57]:
In [58]:
# Check what kind of object predict() returned.
class(x = test_prediction)
Out[58]:
In [59]:
library(ROCR)
In [64]:
# True Positive Rate
# False Positive Rate
# Area Under the Curve
# Precision
# Recall
In [66]:
# Pair the predicted values with the observed deposit labels from test in a
# ROCR prediction object, which the performance() calls below consume.
pr <- prediction(
  predictions = test_prediction,
  labels = test$deposit
)
In [67]:
# ROC curve data: true positive rate ("tpr") as the measure, with false
# positive rate ("fpr") as the x-axis measure.
prf <- performance(
  prediction.obj = pr,
  measure = "tpr",
  x.measure = "fpr"
)
In [79]:
# Draw the tpr-versus-fpr performance object.
plot(x = prf)
In [69]:
# Area under the curve: request the "auc" measure from ROCR. The result is a
# performance object; the number itself is pulled out in the next cell.
auc <- performance(
  prediction.obj = pr,
  measure = "auc"
)
In [70]:
# Replace the performance object with the first element of its y.values slot.
auc <- slot(auc, "y.values")[[1L]]
In [71]:
# Display the AUC value.
auc
Out[71]:
In [80]:
# Precision-recall curve: precision ("prec") is the measure and recall ("rec")
# is the x-axis measure.
precision_recall <- performance(
  prediction.obj = pr,
  measure = "prec",
  x.measure = "rec"
)
plot(precision_recall)